In [354]:
import numpy as np
import pandas as pd
import scanpy as sc
sc.settings.verbosity = 3 
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

results_file = 'results_HL.h5ad'
adata_r = sc.read_10x_mtx('/Users/bahawarsdhillon/Desktop/BIO 257 - Applied Genomics/Scanpy-Project/filtered_feature_bc_matrix/', var_names='gene_symbols',cache=False)
adata_r.var_names_make_unique()
adata_r
scanpy==1.6.0 anndata==0.7.4 umap==0.4.6 numpy==1.19.2 scipy==1.5.3 pandas==1.1.3 scikit-learn==0.23.2 statsmodels==0.12.0 python-igraph==0.8.3 leidenalg==0.8.2
--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.
Out[354]:
AnnData object with n_obs × n_vars = 3394 × 36601
    var: 'gene_ids', 'feature_types'
In [355]:
sc.pp.filter_cells(adata_r, min_genes=200)
sc.pp.filter_genes(adata_r, min_cells=3)
sc.pp.filter_cells(adata_r, max_counts=39766)
sc.pp.filter_cells(adata_r, max_genes=5942)
adata_r.var['mt'] = adata_r.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
sc.pp.calculate_qc_metrics(adata_r, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
adata = adata_r[adata_r.obs.pct_counts_mt < 10, :]
filtered out 413 cells that have less than 200 genes expressed
filtered out 16445 genes that are detected in less than 3 cells
filtered out 56 cells that have more than 39766 counts
filtered out 10 cells that have more than 5942 genes expressed
In [356]:
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(adata)
adata.raw = adata
normalizing counts per cell
    finished (0:00:00)
extracting highly variable genes
    finished (0:00:00)
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
In [357]:
sc.pp.regress_out(adata, 'pct_counts_mt')
sc.pp.scale(adata, max_value=10)
regressing out pct_counts_mt
    sparse input is densified and may lead to high memory use
... storing 'feature_types' as categorical
    finished (0:01:08)
In [358]:
cell_cycle_genes = [x.strip() for x in open('cell_cycle.txt')]
print(len(cell_cycle_genes))

# Split into 2 lists
s_genes = cell_cycle_genes[:43]
g2m_genes = cell_cycle_genes[43:]

cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]
print(len(cell_cycle_genes))
97
94
In [359]:
sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
sc.pl.violin(adata, ['S_score', 'G2M_score'],
             jitter=0.4)
calculating cell cycle phase
computing score 'S_score'
WARNING: genes are not in var_names and ignored: ['MLF1IP']
    finished: added
    'S_score', score of gene set (adata.obs).
    645 total control genes are used. (0:00:00)
computing score 'G2M_score'
WARNING: genes are not in var_names and ignored: ['FAM64A', 'HN1']
    finished: added
    'G2M_score', score of gene set (adata.obs).
    769 total control genes are used. (0:00:00)
-->     'phase', cell cycle phase (adata.obs)
... storing 'phase' as categorical
In [360]:
sc.tl.pca(adata, svd_solver='arpack')
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=0.5)
sc.pl.umap(adata, color=['leiden'])
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 40
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:04)
running Leiden clustering
    finished: found 11 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
In [361]:
sc.pl.umap(adata, color="phase")
In [36]:
adata_r
Out[36]:
AnnData object with n_obs × n_vars = 2915 × 20156
    obs: 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
In [37]:
# sc.pp.regress_out(adata, 'phase')
# sc.pp.scale(adata, max_value=10)
In [42]:
tfh = []
th1 = []
treg = []
memory = []
cd4 = []
cd8= []

with open("/Users/bahawarsdhillon/Desktop/BIO 257 - Applied Genomics/Scanpy-Project/CD8_ARS.txt",'r') as myfile:
    for line in myfile:
        line=line.strip('\n')
        cd8.append(line)

with open("/Users/bahawarsdhillon/Desktop/BIO 257 - Applied Genomics/Scanpy-Project/CD4_Ars.txt",'r') as myfile:
    for line in myfile:
        line=line.strip('\n')
        cd4.append(line)

with open("/Users/bahawarsdhillon/Desktop/BIO 257 - Applied Genomics/Scanpy-Project/TH1_ARS.txt",'r') as myfile:
    for line in myfile:
        line=line.strip('\n')
        th1.append(line)

with open("/Users/bahawarsdhillon/Desktop/BIO 257 - Applied Genomics/Scanpy-Project/tfh_Ars.txt",'r') as myfile:
    for line in myfile:
        line=line.strip('\n')
        tfh.append(line)

with open("/Users/bahawarsdhillon/Desktop/BIO 257 - Applied Genomics/Scanpy-Project/Treg_ARS.txt",'r') as myfile:
    for line in myfile:
        line=line.strip('\n')
        treg.append(line)

with open("/Users/bahawarsdhillon/Desktop/BIO 257 - Applied Genomics/Scanpy-Project/Memory_ARS.txt",'r') as myfile:
    for line in myfile:
        line=line.strip('\n')
        memory.append(line)

        
In [44]:
print("TH1: \n", len(th1))
th1 = [x for x in th1 if x in adata.var_names]
print(len(th1))

print("TFH: \n", len(tfh))
tfh = [x for x in tfh if x in adata.var_names]
print(len(tfh))

print("CD4: \n", len(cd4))
cd4 = [x for x in cd4 if x in adata.var_names]
print(len(cd4))

print("CD8: \n", len(cd8))
cd8 = [x for x in cd8 if x in adata.var_names]
print(len(cd8))

print("TREG: \n", len(treg))
treg = [x for x in treg if x in adata.var_names]
print(len(treg))

print("Memory: \n", len(memory))
memory = [x for x in memory if x in adata.var_names]
print(len(memory))
TH1: 
 138
138
TFH: 
 72
61
CD4: 
 9
9
CD8: 
 146
129
TREG: 
 27
23
Memory: 
 33
29
In [462]:
print(treg)
['FOXP3', 'IKZF2', 'TNFRSF4', 'CAPG', 'TNFRSF18', 'CD74', 'CTLA4', 'RGS1', 'CCND2', 'SELL', 'SERINC3', 'SAMSN1', 'IFNGR1', 'GIMAP7', 'LTB', 'BTG1', 'IL7R', 'SDF4', 'CD2', 'SHISA5', 'GPX4', 'MBNL1', 'PELI1']
In [463]:
print(memory)
['RPS27', 'RPS23', 'RPS24', 'RPS14', 'RPS19', 'RPS29', 'RPS16', 'IL7R', 'RPL18A', 'RPS5', 'RPL37A', 'BTG1', 'RPS28', 'RPL36A', 'RPL13', 'RPLP2', 'RPS15A', 'RPL12', 'RPL36', 'RPL9', 'RPS9', 'RPLP1', 'RPS15', 'RPS21', 'RPL38', 'CCND2', 'MALAT1', 'RABAC1', 'EIF4A2']
In [465]:
print(tfh)
['PDCD1', 'RGS10', 'CXCR5', 'TOX', 'TOX2', 'PTRH1', 'ANGPTL2', 'MARCKSL1', 'SMCO4', 'TNFSF8', 'PPP1R14B', 'TNFAIP8', 'HIF1A', 'LPP', 'MAF', 'CD160', 'SH2D1A', 'CD200', 'TBC1D4', 'TPI1', 'RPSA', 'HMGB1', 'ICOS', 'GAPDH', 'PRKCA', 'PTPRCAP', 'ZAP70', 'PTPN11', 'DENND2D', 'ZFP36L1', 'FYN', 'GDI2', 'LIMD2', 'PKM', 'NT5E', 'CTSB', 'PFKL', 'PGAM1', 'MATK', 'TRIM8', 'COX14', 'ASAP1', 'GNG2', 'P2RX7', 'LRMP', 'CD3G', 'ALDOA', 'GNA13', 'ISG15', 'RAB37', 'MMD', 'FAM162A', 'BORCS8', 'DDIT4', 'PTP4A2', 'CD82', 'MIF', 'PFKP', 'GIMAP5', 'EEA1', 'BATF']
In [46]:
allgenes = adata.var_names
iglist = []
for i in range(1, len(allgenes)):
    if allgenes[i].startswith("IG"):
        print(allgenes[i])
        iglist.append(allgenes[i])

sc.tl.score_genes(adata, iglist, score_name="IG")
list_BCells=['CD19', 'SYK', 'PIK3AP1', 'AKT1', 'AKT2', 'AKT3', 'CHUK', 'IKBKB', 'IKBKG']
sc.tl.score_genes(adata, iglist, score_name="bcell")
sc.pl.umap(adata, color=["IG", "bcell"])

sc.tl.score_genes(adata, th1, score_name="th1")
sc.tl.score_genes(adata, tfh, score_name="tfh")
sc.tl.score_genes(adata, treg, score_name="treg")
sc.tl.score_genes(adata, cd8, score_name="cd8")
sc.tl.score_genes(adata, cd4, score_name="cd4")
sc.tl.score_genes(adata, memory, score_name="memory")


sc.pl.umap(adata, color=["cd4", "cd8"])
sc.pl.umap(adata, color=["tfh", "th1", "treg", "memory"])
IGSF3
IGSF9
IGSF8
IGKC
IGKV4-1
IGKV3-15
IGKV3-20
IGFBP2
IGFBP5
IGSF10
IGF2BP2
IGFBP7
IGIP
IGF2R
IGF2BP3
IGFBP3
IGHEP2
IGF2
IGSF22
IGHMBP2
IGSF9B
IGFBP6
IGF1
IGHA2
IGHE
IGHG4
IGHG2
IGHGP
IGHA1
IGHEP1
IGHG1
IGHG3
IGHD
IGHM
IGHV1-2
IGHV1-3
IGHV3-7
IGHV3-23
IGHV1-24
IGHV3-30
IGHV1-69D
IGHV3-74
IGHV5-78
IGDCC3
IGDCC4
IGF1R
IGSF6
IGFBP4
IGF2BP1
IGFLR1
IGFL2
IGFL2-AS1
IGLON5
IGLV1-51
IGLV3-21
IGLV2-8
IGLC1
IGLC2
IGLC3
IGLC5
IGLC7
IGLL1
IGBP1
IGBP1-AS1
computing score 'IG'
    finished: added
    'IG', score of gene set (adata.obs).
    1146 total control genes are used. (0:00:00)
computing score 'bcell'
    finished: added
    'bcell', score of gene set (adata.obs).
    1146 total control genes are used. (0:00:00)
computing score 'th1'
    finished: added
    'th1', score of gene set (adata.obs).
    841 total control genes are used. (0:00:00)
computing score 'tfh'
    finished: added
    'tfh', score of gene set (adata.obs).
    649 total control genes are used. (0:00:00)
computing score 'treg'
    finished: added
    'treg', score of gene set (adata.obs).
    249 total control genes are used. (0:00:00)
computing score 'cd8'
    finished: added
    'cd8', score of gene set (adata.obs).
    938 total control genes are used. (0:00:00)
computing score 'cd4'
    finished: added
    'cd4', score of gene set (adata.obs).
    149 total control genes are used. (0:00:00)
computing score 'memory'
    finished: added
    'memory', score of gene set (adata.obs).
    97 total control genes are used. (0:00:00)
In [54]:
sc.pl.umap(adata, color="CD4")
sc.pl.umap(adata, color="CD8A", size=4)
In [ ]:
 
In [50]:
print(cd8)
['GZMA', 'CCL5', 'GZMB', 'KLRD1', 'NKG7', 'CD8A', 'KLRK1', 'LGALS1', 'KLRC1', 'KLRG1', 'LGALS3', 'GZMK', 'CX3CR1', 'PLAC8', 'ZEB2', 'CTSD', 'H2AFZ', 'EPSTI1', 'DNAJC15', 'CD48', 'CTSW', 'SELPLG', 'TMSB4X', 'PLEK', 'TM6SF1', 'S100A10', 'VIM', 'ZYX', 'ABRACL', 'CRIP1', 'RACGAP1', 'CYBA', 'CCR2', 'PYCARD', 'SUB1', 'RAP1B', 'GLIPR2', 'EMP3', 'KRTCAP2', 'ID2', 'RPA2', 'STMN1', 'S100A6', 'AHNAK', 'DAPK2', 'OSTF1', 'ITGAX', 'ANXA6', 'CD47', 'ARL4C', 'RUNX3', 'IFNGR1', 'LFNG', 'RASGRP2', 'REEP5', 'CHSY1', 'IL18RAP', 'JAK1', 'SGK1', 'KLRC2', 'TBX21', 'SEMA4A', 'SLAMF7', 'TXNDC5', 'ST3GAL6', 'ACTG1', 'CCNB2', 'CTSC', 'BIRC5', 'ITGB2', 'SP100', 'LAMB3', 'UBE2C', 'ANXA2', 'CENPW', 'CALM1', 'COX5B', 'COX5A', 'SNX10', 'EFHD2', 'LSP1', 'SERPINB9', 'HMGB2', 'THY1', 'RBM3', 'SNX5', 'PPP3CA', 'ARL6IP5', 'CKS1B', 'SEC61B', 'GNA15', 'NPM3', 'UBE2G2', 'ACTB', 'UGCG', 'S1PR4', 'CLIC1', 'LMNB1', 'CKS2', 'TPM4', 'TAGLN2', 'NDUFB7', 'PTP4A3', 'H2AFY', 'HMGN2', 'YWHAQ', 'GGH', 'DNAJC9', 'CMPK1', 'GIMAP7', 'CCND3', 'HCST', 'SMC2', 'ANAPC5', 'DEK', 'LSM5', 'CMTM7', 'PCNA', 'H2AFV', 'RAN', 'ANP32E', 'CENPA', 'SLBP', 'DUT', 'TMPO', 'H2AFX', 'TUBB4B', 'UBE2S', 'TUBA1B']
In [51]:
naive = ["CD8A", "CD4"]
sc.tl.score_genes(adata, naive, score_name="naive")
sc.pl.umap(adata, color="naive")
computing score 'naive'
    finished: added
    'naive', score of gene set (adata.obs).
    100 total control genes are used. (0:00:00)
In [52]:
list_NKCells=["NCAM1","GZMK", "GNLY", "GZMA"]
sc.tl.score_genes(adata, list_NKCells, score_name="nk")

sc.pl.umap(adata, color=["leiden", "nk"], ncols=1)
computing score 'nk'
    finished: added
    'nk', score of gene set (adata.obs).
    150 total control genes are used. (0:00:00)
In [62]:
sc.pl.umap(adata, color="CD8A", size=25, color_map="magma")
In [63]:
sc.pl.umap(adata, color="NCAM1", size=25, color_map="magma" )
In [64]:
newmacs= ["ITGAM", "CD68", "CD163"]
sc.tl.score_genes(adata, newmacs, score_name="n macs")
sc.pl.umap(adata, color="n macs", size=25, color_map="magma")
computing score 'n macs'
    finished: added
    'n macs', score of gene set (adata.obs).
    150 total control genes are used. (0:00:00)
In [65]:
sc.pl.umap(adata, color="CD68", size=25, color_map="magma")
In [67]:
list_NaiveT=["CCR7", "IL7R", "LEF1"]
sc.tl.score_genes(adata, list_NaiveT, score_name="naiveT")
sc.pl.umap(adata, color="naiveT", size=25, color_map="magma")
computing score 'naiveT'
    finished: added
    'naiveT', score of gene set (adata.obs).
    150 total control genes are used. (0:00:00)
In [69]:
list_ActivatedT=["CD25", "CD69"]
sc.tl.score_genes(adata, list_ActivatedT, score_name="activeT")
sc.pl.umap(adata, color="activeT", size=25, color_map="magma")
computing score 'activeT'
WARNING: genes are not in var_names and ignored: ['CD25']
    finished: added
    'activeT', score of gene set (adata.obs).
    50 total control genes are used. (0:00:00)
In [362]:
list_TCells=["CD8", "CD3E", "CD4"]
sc.tl.score_genes(adata, list_TCells, score_name="allT")
sc.pl.umap(adata, color="allT", size=25, color_map="magma")
computing score 'allT'
WARNING: genes are not in var_names and ignored: ['CD8']
    finished: added
    'allT', score of gene set (adata.obs).
    100 total control genes are used. (0:00:00)
In [73]:

['CXCR6', 'CCR2', 'ID2', 'NKG7', 'LGALS3', 'PLAC8', 'GZMB', 'LGALS1', 'DNAJC15', 'S100A10', 'GNA15', 'EPSTI1', 'TMSB4X', 'CRIP1', 'ANXA1', 'S100A4', 'CCL5', 'GLIPR2', 'CTSW', 'SUB1', 'LSP1', 'SELPLG', 'ANXA6', 'AHNAK', 'S100A6', 'BHLHE40', 'CD48', 'PLEK', 'IFNGR1', 'VIM', 'ABRACL', 'CYBA', 'CD47', 'SH3BGRL3', 'ZYX', 'S100A13', 'PYCARD', 'CTSD', 'TAGLN2', 'IL18R1', 'ACTG1', 'ST3GAL6', 'OSTF1', 'ADGRE5', 'HMGB2', 'KLRC1', 'EMP3', 'CDC42', 'RORA', 'ITGB7', 'TPST2', 'TBX21', 'S1PR4', 'CALM1', 'MYL6', 'AGPAT4', 'IL18RAP', 'RAP1B', 'PPP3CA', 'CD52', 'RBM3', 'LFNG', 'RASGRP2', 'GLRX', 'KRTCAP2', 'CORO1A', 'RNF138', 'ITGA4', 'PODNL1', 'ARL6IP5', 'ARHGDIB', 'H2AFZ', 'S100A11', 'TXNDC5', 'CLIC1', 'CFL1', 'RUNX3', 'GABARAPL2', 'CROT', 'REEP5', 'ITGB2', 'SERPINB9', 'MRPL33', 'HSP90B1', 'MYL12A', 'GMFG', 'COX5A', 'THY1', 'ARPC5', 'MYO1F', 'PPIB', 'CTSC', 'ATP1A1', 'STMN1', 'COX17', 'CYTH4', 'ECH1', 'PPP1CA', 'PSMB3', 'SLAMF1', 'COX5B', 'ACTB', 'CAPZB', 'GAPDH', 'RPA2', 'NDUFB7', 'PRELID1', 'IL2RB', 'H2AFY', 'ACOT7', 'ANXA2', 'SPN', 'ENO1', 'CD2', 'GGH', 'NPTN', 'RNF166', 'TSPO', 'SEC61B', 'APRT', 'MYO1G', 'SEC61G', 'UBE2G2', 'ACTR3', 'MTPN', 'ESYT1', 'DOK2', 'SMS', 'CLTA', 'CDC42EP3', 'FAM107B', 'TPM4', 'TMED2', 'GIMAP7', 'IDH3A', 'AP2S1', 'TALDO1', 'ANXA5']
In [ ]:
list_BCells=['CD19', 'SYK', 'PIK3AP1', 'AKT1', 'AKT2', 'AKT3', 'CHUK', 'IKBKB', 'IKBKG', 'IGSF9', 'IGSF8', 'IGKC', 'IGKV3-15', 'IGHGP', 'IGFLR1', 'IGLV3-21', 'IGBP1-AS1']
print("IG: \n", len(iglist))
treg = [x for x in iglist if x in adata.uns.hvg]
print(len(iglist))
In [87]:
 
extracting highly variable genes
    finished (0:00:00)
In [98]:
hvgs = adata.var_vector(k="highly_variable")
allgenes = adata.var_names
In [114]:
thelists = []
for d in range (0, len(allgenes)):
    if (hvgs[d]==True):
        thelists.append(allgenes[d])
        
print(len(thelists))


cov=0
  File "<ipython-input-114-6fe652b3466a>", line 11
    if iglist[d] is in thelists:
                    ^
SyntaxError: invalid syntax
In [119]:
iglistv2=[]
for d in range (0,len(iglist)):
    if iglist[d] in thelists:
        iglistv2.append(iglist[d])

print(iglistv2)
['IGSF9', 'IGSF8', 'IGKC', 'IGKV3-15', 'IGHGP', 'IGFLR1', 'IGLV3-21', 'IGBP1-AS1']
In [122]:
cd8v2 = []
for i in range (0, len(cd8)):
    if (cd8[i] not in cd4):
        cd8v2.append(cd8[i])
print(cd8v2)
print(len(cd8))
print(len(cd8v2))
['GZMA', 'CCL5', 'GZMB', 'KLRD1', 'NKG7', 'CD8A', 'KLRK1', 'LGALS1', 'KLRC1', 'KLRG1', 'LGALS3', 'GZMK', 'CX3CR1', 'PLAC8', 'ZEB2', 'CTSD', 'H2AFZ', 'EPSTI1', 'DNAJC15', 'CD48', 'CTSW', 'SELPLG', 'TMSB4X', 'PLEK', 'TM6SF1', 'S100A10', 'VIM', 'ZYX', 'ABRACL', 'CRIP1', 'RACGAP1', 'CYBA', 'CCR2', 'PYCARD', 'SUB1', 'RAP1B', 'GLIPR2', 'EMP3', 'KRTCAP2', 'ID2', 'RPA2', 'STMN1', 'AHNAK', 'DAPK2', 'OSTF1', 'ITGAX', 'ANXA6', 'CD47', 'ARL4C', 'RUNX3', 'IFNGR1', 'LFNG', 'RASGRP2', 'REEP5', 'CHSY1', 'IL18RAP', 'JAK1', 'SGK1', 'KLRC2', 'TBX21', 'SEMA4A', 'SLAMF7', 'TXNDC5', 'ST3GAL6', 'CCNB2', 'CTSC', 'BIRC5', 'ITGB2', 'SP100', 'LAMB3', 'UBE2C', 'ANXA2', 'CENPW', 'CALM1', 'COX5B', 'COX5A', 'SNX10', 'EFHD2', 'LSP1', 'SERPINB9', 'HMGB2', 'THY1', 'RBM3', 'SNX5', 'PPP3CA', 'ARL6IP5', 'CKS1B', 'SEC61B', 'GNA15', 'NPM3', 'UBE2G2', 'ACTB', 'UGCG', 'S1PR4', 'CLIC1', 'LMNB1', 'CKS2', 'TPM4', 'TAGLN2', 'NDUFB7', 'PTP4A3', 'H2AFY', 'HMGN2', 'YWHAQ', 'GGH', 'DNAJC9', 'CMPK1', 'GIMAP7', 'CCND3', 'HCST', 'SMC2', 'ANAPC5', 'DEK', 'LSM5', 'CMTM7', 'PCNA', 'H2AFV', 'RAN', 'ANP32E', 'CENPA', 'SLBP', 'DUT', 'TMPO', 'H2AFX', 'TUBB4B', 'UBE2S', 'TUBA1B']
129
127
In [123]:
cd4v2 = []
for i in range (0, len(cd4)):
    if (cd4[i] not in cd8):
        cd4v2.append(cd4[i])

print(len(cd4))
print(len(cd4v2))
9
7
In [91]:
nks = ["GNLY", "NKG7", "KLRB1"]
sc.tl.score_genes(adata, nks, score_name="nks")
sc.pl.umap(adata, color="nks")
computing score 'nks'
    finished: added
    'nks', score of gene set (adata.obs).
    150 total control genes are used. (0:00:00)
In [303]:
list_BIG=['CD19', 'SYK', 'PIK3AP1', 'AKT1', 'AKT2', 'AKT3', 'CHUK', 'IKBKB', 'IKBKG', 'IGSF9', 'IGSF8', 'IGKC', 'IGKV3-15', 'IGHGP', 'IGFLR1', 'IGLV3-21', 'IGBP1-AS1']


sc.tl.leiden(adata, resolution=0.3)
sc.tl.rank_genes_groups(adata, "leiden", method="wilcoxon")
sc.tl.marker_gene_overlap(adata, allmarkers)
running Leiden clustering
    finished: found 11 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:05)
Out[303]:
0 1 2 3 4 5 6 7 8 9 10
Tfh 61.0 61.0 61.0 61.0 61.0 61.0 61.0 61.0 61.0 61.0 61.0
Th1 138.0 138.0 138.0 138.0 138.0 138.0 138.0 138.0 138.0 138.0 138.0
NK 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0
B 17.0 17.0 17.0 17.0 17.0 17.0 17.0 17.0 17.0 17.0 17.0
CD4 7.0 7.0 7.0 7.0 7.0 7.0 7.0 7.0 7.0 7.0 7.0
CD8 127.0 127.0 127.0 127.0 127.0 127.0 127.0 127.0 127.0 127.0 127.0
4 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
8 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
In [270]:
allmarkers = {"Tfh": tfh, "Th1": th1, "NK": list_NKCells, "B": list_BIG, "CD4": cd4v2, "CD8": cd8v2, "4": ["CD4"], "8": ["CD8A"]}
In [271]:
sc.tl.marker_gene_overlap(adata, allmarkers, top_n_markers=50, normalize="reference")
Out[271]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13
Tfh 0.081967 0.000000 0.0 0.000000 0.049180 0.016393 0.000000 0.000000 0.065574 0.016393 0.016393 0.032787 0.016393 0.016393
Th1 0.007246 0.007246 0.0 0.050725 0.050725 0.036232 0.000000 0.050725 0.094203 0.021739 0.072464 0.050725 0.021739 0.057971
NK 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
B 0.000000 0.117647 0.0 0.000000 0.000000 0.000000 0.117647 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
CD4 0.000000 0.142857 0.0 0.142857 0.142857 0.000000 0.000000 0.000000 0.142857 0.000000 0.000000 0.000000 0.000000 0.142857
CD8 0.000000 0.000000 0.0 0.015748 0.015748 0.039370 0.000000 0.094488 0.047244 0.110236 0.062992 0.031496 0.031496 0.031496
4 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
8 0.000000 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
In [304]:
sc.pl.umap(adata, color="leiden")
In [273]:
sc.tl.score_genes(adata, list_BIG, score_name="Bv2")
sc.pl.umap(adata, color="Bv2")
computing score 'Bv2'
    finished: added
    'Bv2', score of gene set (adata.obs).
    499 total control genes are used. (0:00:00)
In [274]:
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(10)
Out[274]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13
0 CYTOR MS4A1 RPS12 IL7R CXCL13 FCER1G TNFRSF13C CCL5 GAPDH MKI67 CST3 RPS7 GZMB AL157895.1
1 HLA-A IGHM RPL32 TPT1 AC004585.1 FTL BANK1 NKG7 ACTB STMN1 IFITM3 CCL17 SERPINF1 TPSAB1
2 IL32 CD79A RPL13 PGM2L1 SESN1 PSAP CD79A CTSW PFN1 HMGB2 VIM TMSB4X IRF7 TPSB2
3 CTLA4 CD37 RPS3A GPR171 RBPJ TIMP1 CD83 GZMK IL32 HMGN2 CD63 LGALS1 IRF4 HPGDS
4 DUSP4 IGHD RPL10 KLRB1 LINC01871 TYROBP MS4A1 KLRK1 PKM HMGB1 HLA-DRA RACK1 GPR183 CMA1
5 B2M CD83 RPS3 RNF19A KIAA0319L LYZ BASP1 CST7 SUB1 TUBB FKBP1A GSTP1 CLIC3 CPA3
6 TRAC CD74 RPL34 NR3C1 ID2 AIF1 CD37 HCST CFL1 TUBA1B ANXA2 TXN EGLN3 KIT
7 HLA-B FCER2 RPS6 RPS15A ARMH1 IFI30 HLA-DQA1 GZMA RANBP1 ASPM HLA-DPB1 RPL17 JCHAIN GATA2
8 MIR4435-2HG HLA-DQA1 RPL3 SESN1 PGM2L1 PLAUR NFKBID AC020916.1 ENO1 TPX2 HLA-DPA1 NPM1 LILRB4 HDC
9 ARID5B HLA-DRA RPL21 ANXA1 RNASET2 C5AR1 CD74 FOSB SUMO2 CENPF GSN BCL2A1 TCF4 IL1RL1
In [275]:
sc.pl.umap(adata, color="leiden", size=25)
In [276]:
adata
Out[276]:
AnnData object with n_obs × n_vars = 2359 × 20156
    obs: 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase', 'leiden', 'IG', 'bcell', 'th1', 'tfh', 'treg', 'cd8', 'cd4', 'memory', 'naive', 'nk', 'n macs', 'naiveT', 'activeT', 'allT', 'nks', 'Bv2', 'HRS', 'myscore'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'phase_colors', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [277]:
sc.pl.umap(adata, color=["n macs", "allT"])
In [278]:
sc.pl.umap(adata, color="phase")
In [279]:
sc.pl.umap(adata, color="total_counts")
In [280]:
sc.pl.umap(adata, color=["CD4","CD163","n_genes"], color_map="magma")
In [281]:
list_HRS= ["TNFRSF8", "CD274", "IRF4", "PAX5", "FUT4"]
sc.tl.score_genes(adata, list_HRS, score_name="HRS")
sc.pl.umap(adata, color="HRS", color_map="magma")
computing score 'HRS'
    finished: added
    'HRS', score of gene set (adata.obs).
    200 total control genes are used. (0:00:00)
In [282]:
sc.pl.umap(adata, color=["CD3E", "allT"], color_map="magma")
In [283]:
sc.pl.umap(adata, color=["IL2RA", "CD69"], color_map="magma")
In [284]:
sc.pl.umap(adata, color=["CCR7", "IL7R", "LEF1"], color_map="magma")
In [285]:
sc.pl.umap(adata, color=["IL2RA", "FOXP3", "CTLA4", "LAG3", "TNFRSF18", "IKZF2", "IKZF4", "LGMN"], color_map="magma")
In [286]:
adata
Out[286]:
AnnData object with n_obs × n_vars = 2359 × 20156
    obs: 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase', 'leiden', 'IG', 'bcell', 'th1', 'tfh', 'treg', 'cd8', 'cd4', 'memory', 'naive', 'nk', 'n macs', 'naiveT', 'activeT', 'allT', 'nks', 'Bv2', 'HRS', 'myscore'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'phase_colors', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [287]:
list_Macrophages=["CD68", "ITGAM", "MPEG1", "CD163", "IRF7", "IFI44"]
macrophagedict = {"Macrophage": list_Macrophages}
sc.pl.matrixplot(adata, macrophagedict, groupby='leiden')
In [288]:
sc.pl.umap(adata, color=["IL9R", "CD33", "IL5RA"], color_map="magma")
In [289]:
sc.pl.violin(adata, keys=["IL9R", "CD33", "IL5RA"], groupby="leiden")
In [290]:
sc.pl.violin(adata, keys=["CD4", "CD8A"], groupby="leiden")
In [291]:
adata
Out[291]:
AnnData object with n_obs × n_vars = 2359 × 20156
    obs: 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase', 'leiden', 'IG', 'bcell', 'th1', 'tfh', 'treg', 'cd8', 'cd4', 'memory', 'naive', 'nk', 'n macs', 'naiveT', 'activeT', 'allT', 'nks', 'Bv2', 'HRS', 'myscore'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'phase_colors', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [292]:
sc.pl.umap(adata, color=["leiden", "CD8A", "CD4"], color_map="magma")
In [293]:
sc.pl.umap(adata, color=["leiden", "PDCD1", "CXCR5", "BCL6", "CD3E", "CD4"], color_map="magma")
In [294]:
sc.pl.umap(adata, color=["leiden", "IL2RA", "CD4", "CD8A"], color_map="magma")
In [295]:
sc.tl.score_genes(adata, ["IL2RA", "CD4", "CD8A"], score_name="myscore")
sc.pl.umap(adata, color="myscore", color_map="magma")
computing score 'myscore'
    finished: added
    'myscore', score of gene set (adata.obs).
    100 total control genes are used. (0:00:00)
In [296]:
sc.pl.violin(adata, keys=["th1", "tfh", "treg", "memory", "cd4", "cd8"], groupby="leiden")
In [301]:
sc.pl.violin(adata, keys=["CD19", "CR2", "CD83", "NT5E"] , groupby="leiden")
In [302]:
sc.pl.umap(adata, color=["leiden", "CD19", "CR2", "CD83", "NT5E"], color_map="magma")
In [363]:
sc.pl.umap(adata, color="leiden", color_map="magma")
In [364]:
t = adata[adata.obs['leiden'].isin(["0", "1", "3", "5", "6", "8"])]
sc.pl.umap(t, color="leiden")
In [365]:
t2 = t
sc.pp.highly_variable_genes(t2, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pl.highly_variable_genes(t2)
t2.raw = t2
sc.tl.pca(t, svd_solver='arpack')
sc.tl.leiden(t2, resolution=0.4)
sc.tl.rank_genes_groups(t2, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(t2, n_genes=25, sharey=False)
pd.DataFrame(t2.uns['rank_genes_groups']['names']).head(10)
extracting highly variable genes
    finished (0:00:00)
Trying to set attribute `.uns` of view, copying.
--> added
    'highly_variable', boolean vector (adata.var)
    'means', float vector (adata.var)
    'dispersions', float vector (adata.var)
    'dispersions_norm', float vector (adata.var)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
running Leiden clustering
    finished: found 6 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:06)
In [366]:
pd.DataFrame(t2.uns['rank_genes_groups']['names']).head(10)
Out[366]:
0 1 2 3 4 5
0 HLA-A AC004585.1 RPS12 STMN1 NKG7 EHD2
1 CYTOR SESN1 RPL10 TUBA1B GZMA COX7A1
2 MIR4435-2HG PGM2L1 RPS3 TUBB GZMB CAV2
3 CTLA4 RNASET2 RPL32 HMGB1 CST7 SPARC
4 PMAIP1 NR3C1 RPL13 H2AFZ PRF1 CNN3
5 IL32 CXCL13 RPL21 HMGN2 CCL4 CEBPD
6 HLA-B TXNIP RPL3 HMGB2 CCL5 SNCG
7 ARID5B VIM RPS3A CENPM HCST RBPMS
8 B2M RBPJ RPS27A TYMS HAVCR2 NR2F2
9 DUSP4 LINC01871 RPL34 CKS1B GZMK SPARCL1
In [367]:
sc.pp.neighbors(t2, n_neighbors=10, n_pcs=40)
sc.tl.umap(t2)
sc.pl.umap(t2, color="CD8A", color_map="magma")
computing neighbors
    using 'X_pca' with n_pcs = 40
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:00)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:03)
In [368]:
sc.pl.umap(t2, color="CD4", color_map="magma")
In [369]:
sc.pl.umap(t2, color="leiden", color_map="magma")
In [370]:
sc.pl.umap(t2, color="phase")
In [371]:
sc.tl.score_genes(t2, cd4v2, score_name="cd4T")
sc.tl.score_genes(t2, cd8v2, score_name="cd8T")
sc.tl.score_genes(t2, th1, score_name="th1T")
sc.tl.score_genes(t2, treg, score_name="tregT")
sc.tl.score_genes(t2, memory, score_name="memoryT")

sc.pl.umap(t2, color=["cd4T", "cd8T"], color_map="magma")
sc.pl.umap(t2, color=["th1T", "tregT", "memoryT"], color_map="magma")
computing score 'cd4T'
    finished: added
    'cd4T', score of gene set (adata.obs).
    200 total control genes are used. (0:00:00)
computing score 'cd8T'
    finished: added
    'cd8T', score of gene set (adata.obs).
    1043 total control genes are used. (0:00:00)
computing score 'th1T'
    finished: added
    'th1T', score of gene set (adata.obs).
    1091 total control genes are used. (0:00:00)
computing score 'tregT'
    finished: added
    'tregT', score of gene set (adata.obs).
    348 total control genes are used. (0:00:00)
computing score 'memoryT'
    finished: added
    'memoryT', score of gene set (adata.obs).
    844 total control genes are used. (0:00:00)
In [372]:
list_Th1=["IFNG", "TBX21", "CD4"]
sc.tl.score_genes(t2, list_Th1, score_name="Th1v2")
sc.pl.umap(t2, color="Th1v2", color_map="magma")
computing score 'Th1v2'
    finished: added
    'Th1v2', score of gene set (adata.obs).
    100 total control genes are used. (0:00:00)
In [326]:
sc.pl.umap(t2, color=["leiden", "CD4", "CD8A"], color_map="magma")
In [373]:
sc.pl.violin(t2, keys=["IFNG", "TBX21"], groupby="leiden")
In [376]:
sc.pl.violin(t2, keys=["S_score","G2M_score", "n_genes"], groupby="leiden")
In [375]:
t2
Out[375]:
AnnData object with n_obs × n_vars = 1617 × 20156
    obs: 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase', 'leiden', 'allT', 'cd4T', 'cd8T', 'th1T', 'tregT', 'memoryT', 'Th1v2'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'phase_colors', 'rank_genes_groups'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [377]:
t3 = t2[t2.obs['leiden'].isin(["0", "1", "2", "4", "5"])]
In [ ]:
sc.pp.highly_variable_genes(t3, min_mean=0.0125, max_mean=3, min_disp=0.5)
t3.raw = t3
sc.tl.pca(t, svd_solver='arpack')
In [403]:
sc.tl.leiden(t3, resolution=0.3)
sc.tl.rank_genes_groups(t3, 'leiden', method='wilcoxon')
pd.DataFrame(t3.uns['rank_genes_groups']['names']).head(10)
running Leiden clustering
    finished: found 3 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:04)
Out[403]:
0 1 2
0 RPL34 CTLA4 CEBPD
1 RPS27 SH2D1A CD63
2 RPL30 B2M NFIB
3 RPL13 TRAC GNG12
4 RPS3A HLA-A COL4A2
5 RPL39 IL32 TSPAN4
6 RPL10 ACTB CCN1
7 RPS3 CD27 COL4A1
8 RPL32 DUSP4 NNMT
9 RPL11 HLA-B IGFBP7
In [404]:
sc.pl.violin(t3, keys=["CD4", "CD8A"], groupby="leiden")
In [405]:
sc.pl.violin(t3, keys=["GZMK", "GZMB", "GNLY"], groupby="leiden")
In [406]:
sc.pl.umap(t3, color="leiden")
In [407]:
sc.pl.violin(t3, keys=["IL2RA", "CD69"], groupby="leiden")
In [408]:
sc.pl.umap(t, color="leiden")
In [409]:
t4 = adata[adata.obs['leiden'].isin(["0", "1", "3", "5", "6", "8"])]
sc.pl.umap(t4, color=["leiden", "CD4", "CD8A"], color_map="magma")
In [424]:
sc.tl.score_genes(t4, ["CD8A"], score_name="CD8X")
t4
computing score 'CD8X'
    finished: added
    'CD8X', score of gene set (adata.obs).
    50 total control genes are used. (0:00:00)
Out[424]:
AnnData object with n_obs × n_vars = 1617 × 20156
    obs: 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase', 'leiden', 'allT', 'CD8A', 'CD8X'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'phase_colors'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [425]:
sc8 = t4.obs_vector(k='CD8X')
print(np.median(sc8))
print(np.std(sc8))
print(np.median(sc8)+ np.std(sc8))
-0.20651205
0.637557
0.431045
In [418]:
t4.obs.columns
Out[418]:
Index(['n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts',
       'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase',
       'leiden', 'allT', 'CD8A'],
      dtype='object')
In [431]:
cd8 = t4[t4.obs.CD8X > 0.431045, :]
cd8
sc.pl.umap(t4, color="leiden")
sc.pl.umap(cd8, color="leiden")
In [430]:
sc.tl.leiden(cd8, resolution=0.6)
sc.pl.umap(cd8, color=['leiden'])
sc.tl.rank_genes_groups(cd8, 'leiden', method='wilcoxon')
pd.DataFrame(cd8.uns['rank_genes_groups']['names']).head(5)
running Leiden clustering
    finished: found 3 clusters and added
    'leiden', the cluster labels (adata.obs, categorical) (0:00:00)
ranking genes
    finished: added to `.uns['rank_genes_groups']`
    'names', sorted np.recarray to be indexed by group ids
    'scores', sorted np.recarray to be indexed by group ids
    'logfoldchanges', sorted np.recarray to be indexed by group ids
    'pvals', sorted np.recarray to be indexed by group ids
    'pvals_adj', sorted np.recarray to be indexed by group ids (0:00:00)
Out[430]:
0 1 2
0 RPS27 CTLA4 MT-ND4
1 RPL21 HERPUD1 MT-CO2
2 RPL17 DUSP4 MT-ND3
3 RPS28 TRAC RPL3
4 RPS23 GAPDH VIM
In [432]:
sc.tl.score_genes(t4, ["CD4"], score_name="CD4X")
sc4 = t4.obs_vector(k='CD4X')
print(np.median(sc4))
print(np.std(sc4))
print(np.median(sc4)+ np.std(sc4))
computing score 'CD4X'
    finished: added
    'CD4X', score of gene set (adata.obs).
    50 total control genes are used. (0:00:00)
-0.34476745
0.5185398
0.17377234
In [433]:
cd4 = t4[t4.obs.CD4X > 0.17377234, :]
cd4
sc.pl.umap(t4, color="leiden")
sc.pl.umap(cd4, color="leiden")
In [445]:
sc.pl.umap(cd4, color="CD8A", color_map="magma")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-445-e730e9131adb> in <module>
----> 1 sc.pl.umap(cd4, color="CD8A", color_map="magma")

~/miniconda3/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py in umap(adata, **kwargs)
    613     If `show==False` a :class:`~matplotlib.axes.Axes` or a list of it.
    614     """
--> 615     return embedding(adata, 'umap', **kwargs)
    616 
    617 

~/miniconda3/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py in embedding(adata, basis, color, gene_symbols, use_raw, sort_order, edges, edges_width, edges_color, neighbors_key, arrows, arrows_kwds, groups, components, layer, projection, img_key, crop_coord, alpha_img, bw, library_id, color_map, palette, size, frameon, legend_fontsize, legend_fontweight, legend_loc, legend_fontoutline, vmax, vmin, add_outline, outline_width, outline_color, ncols, hspace, wspace, title, show, save, ax, return_fig, **kwargs)
    226         itertools.product(color, idx_components)
    227     ):
--> 228         color_vector, categorical = _get_color_values(
    229             adata,
    230             value_to_plot,

~/miniconda3/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py in _get_color_values(adata, value_to_plot, groups, palette, use_raw, gene_symbols, layer)
   1037         values = adata.raw.obs_vector(value_to_plot)
   1038     else:
-> 1039         values = adata.obs_vector(value_to_plot, layer=layer)
   1040 
   1041     ###

~/miniconda3/lib/python3.8/site-packages/anndata/_core/anndata.py in obs_vector(self, k, layer)
   1364                 )
   1365                 layer = None
-> 1366         return get_vector(self, k, "obs", "var", layer=layer)
   1367 
   1368     def var_vector(self, k, *, layer: Optional[str] = None) -> np.ndarray:

~/miniconda3/lib/python3.8/site-packages/anndata/_core/index.py in get_vector(adata, k, coldim, idxdim, layer)
    156 
    157     if (in_col + in_idx) == 2:
--> 158         raise ValueError(
    159             f"Key {k} could be found in both .{idxdim}_names and .{coldim}.columns"
    160         )

ValueError: Key CD8A could be found in both .var_names and .obs.columns
In [446]:
t4
Out[446]:
AnnData object with n_obs × n_vars = 1617 × 20156
    obs: 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'S_score', 'G2M_score', 'phase', 'leiden', 'allT', 'CD8A', 'CD8X', 'CD4X'
    var: 'gene_ids', 'feature_types', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
    uns: 'log1p', 'hvg', 'pca', 'neighbors', 'umap', 'leiden', 'leiden_colors', 'phase_colors'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
In [447]:
t_cytotoxic=["GZMK", "GNLY", "GZMA"]
t_Tregs=["IL2RA", "FOXP3", "CTLA4", "LAG3", "TNFRSF18", "IKZF2", "IKZF4", "LGMN"]
t_Th1=["IFNG", "TBX21", "CD3E", "CD4"]
t_Th2=["GATA3", "CRTH2", "IL4", "IL13", "CD3E", "CD4"]
t_Th17=["CD161", "CCR4", "CD3E", "CD4"]
t_TFH=["PDCD1", "CXCR5", "BCL6", "CD3E", "CD4"]


sc.tl.score_genes(t4, t_Tregs, score_name="Tregs")
sc.tl.score_genes(t4, t_Th1, score_name="TH1")
sc.tl.score_genes(t4, t_Th17, score_name="TH17")
sc.tl.score_genes(t4, t_Th2, score_name="TH2")
sc.tl.score_genes(t4, t_TFH, score_name="TFH")
sc.tl.score_genes(t4, t_cytotoxic, score_name="cytotoxic")

sc.pl.violin(t4, keys=["Tregs","TH1", "TH2", "TH17", "cytotoxic", "TFH"], groupby="leiden")
computing score 'Tregs'
    finished: added
    'Tregs', score of gene set (adata.obs).
    250 total control genes are used. (0:00:00)
computing score 'TH1'
    finished: added
    'TH1', score of gene set (adata.obs).
    149 total control genes are used. (0:00:00)
computing score 'TH17'
WARNING: genes are not in var_names and ignored: ['CD161']
    finished: added
    'TH17', score of gene set (adata.obs).
    100 total control genes are used. (0:00:00)
computing score 'TH2'
WARNING: genes are not in var_names and ignored: ['CRTH2']
    finished: added
    'TH2', score of gene set (adata.obs).
    250 total control genes are used. (0:00:00)
computing score 'TFH'
    finished: added
    'TFH', score of gene set (adata.obs).
    200 total control genes are used. (0:00:00)
computing score 'cytotoxic'
    finished: added
    'cytotoxic', score of gene set (adata.obs).
    150 total control genes are used. (0:00:00)
In [448]:
sc.pl.violin(t4, keys=["Tregs","TH1", "TH2", "TH17", "cytotoxic", "TFH"], groupby="leiden")
In [450]:
sc.pl.umap(t4, color="leiden")
In [449]:
t_NaiveT=["CCR7", "IL7R", "LEF1"]
sc.tl.score_genes(t4, t_NaiveT, score_name="NaiveT")
sc.pl.violin(t4, keys="NaiveT", groupby="leiden")
computing score 'NaiveT'
    finished: added
    'NaiveT', score of gene set (adata.obs).
    99 total control genes are used. (0:00:00)
In [453]:
sc.pl.violin(t4, keys="NaiveT", groupby="leiden")
In [459]:
sc.pl.violin(t4, keys="Tregs", groupby="leiden")
In [452]:
sc.pl.violin(t4, keys="IL2RA", groupby="leiden")
In [455]:
sc.pl.violin(t4, keys="cytotoxic", groupby="leiden")
In [451]:
sc.pl.umap(adata, color="leiden")
In [457]:
sc.pl.violin(t4, keys=["CD4"], groupby="leiden")
In [458]:
sc.pl.violin(t4, keys=["IL2RA", "FOXP3", "CTLA4", "LAG3", "TNFRSF18"], groupby="leiden")
In [460]:
sc.pl.umap(adata, color="leiden")
In [ ]:
labelling = adata

new_cluster_names = [
    'CD4 T', 'CD14 Monocytes',
    'B', 'CD8 T',
    'NK', 'FCGR3A Monocytes',
    'Dendritic', 'Megakaryocytes']
adata.rename_categories('leiden', new_cluster_names)
In [461]:
t4.write("tcells.h5ad")
... storing 'feature_types' as categorical
In [ ]: